/*******************************************************************************
* Copyright (c) 2011 Stephan Schwiebert. All rights reserved. This program and
* the accompanying materials are made available under the terms of the Eclipse
* Public License v1.0 which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
* <p/>
* Contributors: Stephan Schwiebert - initial API and implementation
*******************************************************************************/
package org.activiti.designer.elsag.tagCloud;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
 * Counts how often each word occurs in a text file and returns the words as
 * {@link Type} objects, ordered by descending occurrence count. Words listed in an
 * optional stop word file (see {@link #setStopwords(String)}) are left out.
 *
 * @author sschwieb
 *
 */
public class TypeCollector {

    /** Path of the optional stop word file; {@code null} disables stop word filtering. */
    private static String stopWords;

    /**
     * Reads the given file, splits its content into words using a
     * {@link BreakIterator} for the default locale and counts the occurrences of
     * each word. Matching is case-sensitive for counting, but stop words are
     * compared in lower case. Words of at most one character are ignored.
     *
     * @param file the text file to analyze
     * @param encoding name of the charset used to decode {@code file}; the same
     *        charset is used for the stop word file, if one is configured
     * @return one {@link Type} per distinct word, sorted by descending count
     * @throws IOException if the text file or the stop word file cannot be read,
     *         or if {@code encoding} is not a supported charset name
     */
    public static List<Type> getData(File file, String encoding) throws IOException {
        StringBuilder text = new StringBuilder();
        for (String line : readLines(file, encoding)) {
            text.append(line).append('\n');
        }

        Set<String> stops = new HashSet<String>();
        // Read the static field once so the null check and the use see the same value.
        String stopWordFile = stopWords;
        if (stopWordFile != null) {
            for (String line : readLines(new File(stopWordFile), encoding)) {
                stops.add(line.toLowerCase().trim());
            }
        }

        return getMostImportantTypes(countWords(text.toString(), stops));
    }

    /**
     * Reads all lines of a file, always releasing the underlying file handle.
     *
     * @param source the file to read
     * @param encoding name of the charset used to decode the file
     * @return the lines of the file, without line terminators
     * @throws IOException if the file cannot be opened or read, or if the
     *         encoding is not supported
     */
    private static List<String> readLines(File source, String encoding) throws IOException {
        FileInputStream in = new FileInputStream(source);
        try {
            // The reader is created inside the try block: an unsupported encoding
            // makes the InputStreamReader constructor throw after the file is open.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding));
            List<String> lines = new ArrayList<String>();
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            return lines;
        } finally {
            in.close();
        }
    }

    /**
     * Splits the text at word boundaries and counts every token that is neither a
     * stop word nor shorter than two characters.
     *
     * @param txt the text to tokenize
     * @param stops lower-cased stop words to skip
     * @return a map from word to number of occurrences
     */
    private static Map<String, Integer> countWords(String txt, Set<String> stops) {
        BreakIterator iterator = BreakIterator.getWordInstance(Locale.getDefault());
        iterator.setText(txt);
        Map<String, Integer> counts = new HashMap<String, Integer>();

        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
            String token = txt.substring(start, end).trim();
            if (token.length() == 0) {
                continue;
            }
            // Drop one trailing non-word character (punctuation and the like). The
            // check works on code points so that a trailing surrogate pair is never
            // cut in half, and it keeps digits so that numbers such as "2011" are
            // not truncated to "201".
            int last = token.codePointBefore(token.length());
            if (!Character.isLetterOrDigit(last)) {
                token = token.substring(0, token.length() - Character.charCount(last));
            }
            if (stops.contains(token.toLowerCase()) || token.trim().length() <= 1) {
                continue;
            }
            Integer count = counts.get(token);
            counts.put(token, count == null ? 1 : count + 1);
        }
        return counts;
    }

    /**
     * Converts the word counts into {@link Type} objects sorted by descending
     * number of occurrences.
     *
     * @param strings map from word to number of occurrences
     * @return a new list with one {@link Type} per map entry
     */
    private static List<Type> getMostImportantTypes(final Map<String, Integer> strings) {
        List<Type> sorted = new ArrayList<Type>(strings.size());
        for (Entry<String, Integer> entry : strings.entrySet()) {
            sorted.add(new Type(entry.getKey(), entry.getValue()));
        }
        Collections.sort(sorted, new Comparator<Type>() {
            @Override
            public int compare(Type o1, Type o2) {
                // NOTE(review): assumes getOccurrences() returns the non-negative
                // count passed to the constructor, so the subtraction cannot overflow.
                return o2.getOccurrences() - o1.getOccurrences();
            }
        });
        return sorted;
    }

    /**
     * Sets the file from which stop words are read on subsequent calls to
     * {@link #getData(File, String)}. The file is expected to contain one stop
     * word per line.
     *
     * @param sourceFile path of the stop word file, or {@code null} to disable
     *        stop word filtering
     */
    public static void setStopwords(String sourceFile) {
        stopWords = sourceFile;
    }
}